In [56]:
import polars as pl
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'

# for Df
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset

Exercise to use polars with scikit-learn, Pytorch, Plotly¶

  • using residual sugar as my label instead of quality — just for fun
  • not an exercise to create the best model - just for polars integration
In [57]:
## Importing the dataset
# UCI red-wine-quality dataset; semicolon-separated CSV read straight from the URL.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# infer_schema_length raised so dtypes are inferred from a large sample of rows
raw_data = pl.read_csv(url, separator=';', infer_schema_length=10000)

raw_data.head()
Out[57]:
shape: (5, 12)
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
f64f64f64f64f64f64f64f64f64f64f64i64
7.40.70.01.90.07611.034.00.99783.510.569.45
7.80.880.02.60.09825.067.00.99683.20.689.85
7.80.760.042.30.09215.054.00.9973.260.659.85
11.20.280.561.90.07517.060.00.9983.160.589.86
7.40.70.01.90.07611.034.00.99783.510.569.45
In [58]:
raw_data.schema
Out[58]:
Schema([('fixed acidity', Float64),
        ('volatile acidity', Float64),
        ('citric acid', Float64),
        ('residual sugar', Float64),
        ('chlorides', Float64),
        ('free sulfur dioxide', Float64),
        ('total sulfur dioxide', Float64),
        ('density', Float64),
        ('pH', Float64),
        ('sulphates', Float64),
        ('alcohol', Float64),
        ('quality', Int64)])
In [59]:
# check the describe
raw_data.describe()
Out[59]:
shape: (9, 13)
statisticfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
strf64f64f64f64f64f64f64f64f64f64f64f64
"count"1599.01599.01599.01599.01599.01599.01599.01599.01599.01599.01599.01599.0
"null_count"0.00.00.00.00.00.00.00.00.00.00.00.0
"mean"8.3196370.5278210.2709762.5388060.08746715.87492246.4677920.9967473.3111130.65814910.4229835.636023
"std"1.7410960.179060.1948011.4099280.04706510.46015732.8953240.0018870.1543860.1695071.0656680.807569
"min"4.60.120.00.90.0121.06.00.990072.740.338.43.0
"25%"7.10.390.091.90.077.022.00.99563.210.559.55.0
"50%"7.90.520.262.20.07914.038.00.996753.310.6210.26.0
"75%"9.20.640.422.60.0921.062.00.997843.40.7311.16.0
"max"15.91.581.015.50.61172.0289.01.003694.012.014.98.0
In [60]:
from polars import selectors as cs
raw_data.select(cs.by_dtype(pl.Float64)).drop('residual sugar').columns
Out[60]:
['fixed acidity',
 'volatile acidity',
 'citric acid',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']
In [61]:
# normalise column names: replace spaces with underscores
new_names = {original: original.replace(" ", "_") for original in raw_data.columns}
raw_data = raw_data.rename(new_names)
In [62]:
# get a description of residual sugar — asking whether it could be binned into a
# binary label with reasonably balanced class counts
raw_data['residual_sugar'].describe()
Out[62]:
shape: (9, 2)
statisticvalue
strf64
"count"1599.0
"null_count"0.0
"mean"2.538806
"std"1.409928
"min"0.9
"25%"1.9
"50%"2.2
"75%"2.6
"max"15.5
In [63]:
# plot a histogram to see the distribution of the residual_sugar
fig = px.histogram(raw_data, x='residual_sugar', nbins=10)

fig.update_layout(template='ggplot2', width=600, bargap=0.2)
fig.show()
In [64]:
# label each wine 'High'/'Low' depending on whether residual sugar exceeds 4
residual_sugar_cat = [
    'High' if sugar > 4 else 'Low'
    for sugar in raw_data['residual_sugar']
]

# inspect the dataset: scatter matrix of all float features (label dropped),
# coloured by the binned residual-sugar category
fig = px.scatter_matrix(
    raw_data,
    dimensions=raw_data.select(cs.by_dtype(pl.Float64)).drop('residual_sugar').columns,
    color=residual_sugar_cat
)

fig.update_layout(template='ggplot2', width=1000, height=1000, font_size=8)
fig.update_traces(marker=dict(size=3, opacity=0.8, line=dict(width=0.2, color='darkslategrey')))
In [65]:
# no relationship between quality and residual sugar
fig = px.scatter(
    raw_data,
    y='residual_sugar',
    x='quality'
)

fig.update_layout(template='ggplot2', width=500)

Interpretation¶

  • initially I wanted to bin the residual sugar so I could predict "High" or "Low" residual sugar. However, the exploration showed no obvious relationship, so I dropped that idea.

Prepare the data¶

In [66]:
scaler = StandardScaler()

# drop the residual sugar — it is the regression target, so it must not be
# scaled along with the features (note: 'quality' IS kept and scaled here)
features = raw_data.drop('residual_sugar')

# fit transform the feature columns to zero mean / unit variance (returns numpy)
scaled_features = scaler.fit_transform(features)

# create a polars dataframe from the scaled array, reusing the feature names
scaled_data = pl.DataFrame(scaled_features, schema=features.columns)

# add the (unscaled) residual sugar label column back
scaled_data = scaled_data.with_columns(raw_data['residual_sugar'])
In [67]:
# inspect the scaled data
scaled_data
Out[67]:
shape: (1_599, 12)
fixed_acidityvolatile_aciditycitric_acidchloridesfree_sulfur_dioxidetotal_sulfur_dioxidedensitypHsulphatesalcoholqualityresidual_sugar
f64f64f64f64f64f64f64f64f64f64f64f64
-0.528360.961877-1.391472-0.243707-0.466193-0.3791330.5582741.288643-0.579207-0.960246-0.7878231.9
-0.2985471.967442-1.3914720.2238750.8726380.6243630.028261-0.7199330.12895-0.584777-0.7878232.6
-0.2985471.297065-1.186070.096353-0.0836690.2290470.134264-0.331177-0.048089-0.584777-0.7878232.3
1.654856-1.3844431.484154-0.264960.1075920.41150.664277-0.979104-0.46118-0.5847770.4508481.9
-0.528360.961877-1.391472-0.243707-0.466193-0.3791330.5582741.288643-0.579207-0.960246-0.7878231.9
………………………………
-1.2177960.403229-0.9806690.0538451.542054-0.075043-0.9787650.899886-0.461180.072294-0.7878232.0
-1.3901550.123905-0.877968-0.5412592.2114690.13782-0.8621621.3534360.6010550.7293640.4508482.2
-1.160343-0.099554-0.723916-0.2437071.255161-0.196679-0.5335540.7055080.5420420.541630.4508482.3
-1.3901550.65462-0.775267-0.264961.542054-0.075043-0.6766571.67740.30599-0.209308-0.7878232.0
-1.332702-1.2168491.021999-0.434990.203223-0.135861-0.6660570.511130.0109240.541630.4508483.6
In [68]:
corr_labels = scaled_data.columns

fig = px.imshow(
    scaled_data.corr(),
    y=corr_labels,
    text_auto='.1f',
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu'
)

fig.update_xaxes(side='top')
fig.update_layout(template='ggplot2', width=700, height=700)
In [69]:
# features: every scaled column except the regression target, as float32
dataT = torch.tensor(scaled_data.drop('residual_sugar').to_numpy(), dtype=torch.float32)
# labels reshaped to (n, 1) so they line up with the model's 2D output
labelT = torch.tensor(scaled_data['residual_sugar'].to_numpy(), dtype=torch.float32).reshape(-1, 1)
In [70]:
print(labelT.shape)
print(dataT.shape)
torch.Size([1599, 1])
torch.Size([1599, 11])

Partition the data and loading into DataLoader¶

In [71]:
X_train, X_test, y_train, y_test = train_test_split(dataT, labelT, test_size=0.3, random_state=42)
In [72]:
# create pytorch dataset
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
In [73]:
# wrap the datasets in loaders: shuffled mini-batches for training,
# and one single full-size batch for evaluation
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, drop_last=True)
test_loader = DataLoader(test_data, batch_size=len(test_data))
In [74]:
# check for the shape and dimension of the training tensor data stored in DataLoader object
train_loader.dataset.tensors[0].shape
Out[74]:
torch.Size([1119, 11])
In [75]:
test_loader.dataset.tensors[0].shape
Out[75]:
torch.Size([480, 11])
In [76]:
print(raw_data.shape)
1439 + 160
(1599, 12)
Out[76]:
1599

Create the model class¶

In [77]:
class ANN(nn.Module):
    """Feed-forward regression network: 11 features -> 1 predicted value.

    Architecture 11 -> 16 -> 32 -> 32 -> 1 with ReLU activations on every
    layer except the output (the regression target is unbounded).
    """

    def __init__(self):
        super().__init__()
        # input layer width matches the 11 feature columns
        self.input = nn.Linear(11, 16)
        # two hidden layers
        self.hidden1 = nn.Linear(16, 32)
        self.hidden2 = nn.Linear(32, 32)
        # single-unit output layer for the regression value
        self.output = nn.Linear(32, 1)

    def forward(self, x):
        """Forward pass: x of shape (batch, 11) -> prediction of shape (batch, 1)."""
        for layer in (self.input, self.hidden1, self.hidden2):
            x = F.relu(layer(x))
        return self.output(x)

Create a function to train the model¶

In [78]:
def trainModel(model, train_loader, test_loader, numepochs=200, learning_rate=0.01):
    """Train `model` with SGD on MSE loss, logging per-batch losses.

    Parameters
    ----------
    model : nn.Module
        Network to train; its parameters are updated in place.
    train_loader : DataLoader
        Yields (features, labels) mini-batches for training.
    test_loader : DataLoader
        Holds the whole test set in a single batch (see loader setup);
        used to evaluate test loss after every training step.
    numepochs : int
        Number of full passes over the training data.
    learning_rate : float
        SGD step size.

    Returns
    -------
    pl.DataFrame
        One row per training batch with columns
        iteration / epoch / batch_train_loss / batch_test_loss.
    """
    # define the loss function and optimizer
    lossfunc = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # The test loader serves the full test set as one batch, so fetch it once
    # instead of rebuilding an iterator on every training step.
    X_test, y_test = next(iter(test_loader))

    # Collect one record per batch and build the DataFrame once at the end.
    # The previous version concatenated a one-row DataFrame inside the loop,
    # which is quadratic in the number of batches.
    records = []
    iteration = 0  # global batch counter across all epochs

    for epoch in range(numepochs):
        model.train()
        for X, y in train_loader:
            yhat = model(X)
            loss = lossfunc(yhat, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # evaluate on the held-out set at every step
            model.eval()
            with torch.no_grad():
                loss_test = lossfunc(model(X_test), y_test)
            model.train()  # switch back to training mode for the next batch

            # record the results of this iteration
            records.append({
                'iteration': iteration,
                'epoch': epoch,
                'batch_train_loss': loss.item(),
                'batch_test_loss': loss_test.item(),
            })
            iteration += 1

    # explicit schema keeps the dtypes stable even if `records` is empty
    return pl.DataFrame(
        records,
        schema={
            'iteration': pl.Int64,
            'epoch': pl.Int64,
            'batch_train_loss': pl.Float64,
            'batch_test_loss': pl.Float64,
        },
    )

Note about dataframe creating in polars¶

  • the square brackets [] around the dictionary are there because we are providing a list containing a single dictionary, where each dictionary represents one row of data. This approach is used when you want to create a DataFrame from specific rows of data
  • The [{...}] syntax means "a list containing one dictionary" - that dictionary becomes one row in your DataFrame.
In [79]:
# create a model and run it
# model = ANN()

# test the model
# model(torch.randn(10, 11)).shape

train the model¶

In [80]:
# create model class
model = ANN()

# train the model
training_progess = trainModel(model, train_loader, test_loader, numepochs=200, learning_rate=0.01)
In [81]:
# check the training progress (one row per training batch)
print(training_progess)
shape: (6_800, 4)
┌───────────┬───────┬──────────────────┬─────────────────┐
│ iteration ┆ epoch ┆ batch_train_loss ┆ batch_test_loss │
│ ---       ┆ ---   ┆ ---              ┆ ---             │
│ i64       ┆ i64   ┆ f64              ┆ f64             │
╞═══════════╪═══════╪══════════════════╪═════════════════╡
│ 0         ┆ 0     ┆ 8.357491         ┆ 7.801715        │
│ 1         ┆ 0     ┆ 7.378495         ┆ 7.452112        │
│ 2         ┆ 0     ┆ 6.096545         ┆ 7.126534        │
│ 3         ┆ 0     ┆ 13.573361        ┆ 6.742377        │
│ 4         ┆ 0     ┆ 9.127261         ┆ 6.389193        │
│ …         ┆ …     ┆ …                ┆ …               │
│ 6795      ┆ 199   ┆ 0.182057         ┆ 0.601537        │
│ 6796      ┆ 199   ┆ 0.159327         ┆ 0.597312        │
│ 6797      ┆ 199   ┆ 0.110114         ┆ 0.606896        │
│ 6798      ┆ 199   ┆ 0.657421         ┆ 0.616068        │
│ 6799      ┆ 199   ┆ 0.103466         ┆ 0.618086        │
└───────────┴───────┴──────────────────┴─────────────────┘
In [82]:
# group-by analysis: collapse per-batch losses to one mean row per epoch
# for a less granular view of the training curve
grouped_epoch_training = training_progess.group_by('epoch').agg(
    pl.col('batch_train_loss').mean().alias('batch_train_loss_mean'),
    pl.col('batch_test_loss').mean().alias('batch_test_loss_mean'),
)

grouped_epoch_training.head()
Out[82]:
shape: (5, 3)
epochbatch_train_loss_meanbatch_test_loss_mean
i64f64f64
04.4690633.823411
12.0408081.728339
21.9097081.631237
31.8501191.554289
41.681041.477889

Plot the training and test performance¶

In [83]:
fig = px.scatter(
    data_frame=grouped_epoch_training,
    x='epoch',
    y=['batch_train_loss_mean', 'batch_test_loss_mean'],
    labels={'variable' : 'Performance Metrics', 'value' : 'Loss'}
)

fig.update_layout(template='ggplot2', width=600)
fig.show()
In [84]:
training_progess.head()
Out[84]:
shape: (5, 4)
iterationepochbatch_train_lossbatch_test_loss
i64i64f64f64
008.3574917.801715
107.3784957.452112
206.0965457.126534
3013.5733616.742377
409.1272616.389193
In [85]:
fig = px.scatter(
    data_frame=training_progess,
    x='iteration',
    y=['batch_train_loss', 'batch_test_loss'],
    labels={'variable': 'Performance Metrics', 'value': 'Loss'}
)

fig.update_layout(template='ggplot2', width=600)
fig.show()

Compare the predictions between train and test dataset¶

In [86]:
model.eval()

# using the trained model to predict on both partitions (no gradients needed)
with torch.no_grad():
    yHatTrain = model(X_train)
    yHatTest = model(X_test)

# sanity-check shapes: (n_samples, 1) for each partition
print(yHatTest.shape)
print(yHatTrain.shape)
torch.Size([480, 1])
torch.Size([1119, 1])
In [87]:
# Convert the PyTorch tensors to Polars DataFrames: one frame per partition
# (true value, prediction, partition tag), then stack them into a single
# long-format frame for plotting
partition_frames = []
for truth, preds, set_name in (
    (y_train, yHatTrain, 'Train'),
    (y_test, yHatTest, 'Test'),
):
    partition_frames.append(pl.DataFrame({
        'True': truth.squeeze().cpu().numpy(),
        'Predicted': preds.squeeze().cpu().numpy(),
        'Set': [set_name] * len(truth),
    }))

df_pred = pl.concat(partition_frames)
In [88]:
df_pred
Out[88]:
shape: (1_599, 3)
TruePredictedSet
f32f32str
1.91.91981"Train"
2.02.319143"Train"
2.63.483783"Train"
2.12.209713"Train"
2.22.164838"Train"
………
2.13.494391"Test"
2.63.069432"Test"
1.72.003307"Test"
2.43.661964"Test"
2.32.020392"Test"
In [89]:
fig = px.scatter(
    df_pred,
    x='Predicted',
    y='True',
    color='Set',
    symbol='Set',
    title='Model Predictions vs. True Values',
    labels={'Predicted': 'Predicted Residual Sugar', 'True': 'True Residual Sugar'},
    template='ggplot2'
)

fig.update_layout(template='ggplot2', width=600)
fig.update_traces(marker=dict(line=dict(width=0.4), opacity=0.7))